In [1]:
    
import pandas as pd
def time_convert(x):
    """Convert a time string in ``hh:mm:ss`` format into a number of seconds.

    Parameters
    ----------
    x : str
        Time string whose first three colon-separated fields are integers
        (hours, minutes, seconds).

    Returns
    -------
    int or float
        Total number of seconds, or ``float('nan')`` when ``x`` cannot be
        parsed (not a string, fewer than three fields, non-integer fields).
    """
    try:
        times = x.split(':')
        return (3600*int(times[0])+60*int(times[1]))+int(times[2])
    except (AttributeError, TypeError, ValueError, IndexError):
        # Only parsing failures map to NaN; unrelated exceptions
        # (e.g. KeyboardInterrupt) are allowed to propagate.
        return float('nan')
    
def ReadParseData(filename):
    """Load the results CSV and return the rows that have a positive '5K' time.

    The 'Official Time' and '5K' columns are converted to seconds with
    ``time_convert`` so they can be compared numerically in the regressions,
    and 'M/F' is encoded as 1 for 'M' and 0 for anything else. Only the
    columns used downstream are kept. The number of rows before and after
    filtering is printed.

    Parameters
    ----------
    filename : str
        Path of the CSV file passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, restricted to rows where '5K' > 0.
    """
    # Per-column converters applied by read_csv while parsing; see the
    # `converters` parameter in the pandas.read_csv documentation.
    column_converters = {
        'Official Time': time_convert,
        '5K': time_convert,
        'M/F': lambda value: int(value == 'M'),
    }
    raw = pd.read_csv(filename, converters=column_converters)

    # Keep the series of interest by name (instead of dropping columns by position).
    kept_columns = ['Age','M/F','5K', 'Official Time','Overall','Gender','Division']
    selected = raw[kept_columns]

    print('Numero dati PRIMA del preprocessing:', len(selected))
    # time_convert yields NaN for unparseable values and NaN > 0 is False,
    # so rows without a usable 5K time are removed here as well.
    valid = selected[selected['5K'] > 0]
    print('Numero dati DOPO il preprocessing:', len(valid))
    return valid
    
bm = ReadParseData('./data/marathon_results_2016.csv')
# Display a slice of the data frame (positions 27 through 35).
# To display only the first 3 rows instead, use: bm[:3]
bm[27:36]
    
    
    Out[1]:
In [2]:
    
import numpy as np
import matplotlib.pyplot as plt
def ScatterPlot(bm, Feature1, Feature2):
    """Scatter-plot ``Feature1`` (y axis) against ``Feature2`` (x axis), split by 'M/F'.

    Rows with 'M/F' == 1 are drawn in blue and labelled 'Male'; rows with
    'M/F' == 0 are drawn in red and labelled 'Female'.

    Parameters
    ----------
    bm : pandas.DataFrame
        Frame containing an 'M/F' column and the two feature columns.
    Feature1 : str
        Name of the column plotted on the y axis.
    Feature2 : str
        Name of the column plotted on the x axis.
    """
    # The frame is only read, so no defensive copy is needed; each mask is built once.
    male = bm[bm['M/F'] == 1]
    female = bm[bm['M/F'] == 0]

    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(male[Feature2], male[Feature1], alpha=0.2, c='blue', label='Male')
    ax.scatter(female[Feature2], female[Feature1], alpha=0.2, c='red', label='Female')
    # Label the axes so the figure can be read on its own.
    ax.set_xlabel(Feature2)
    ax.set_ylabel(Feature1)
    # Legend entries come from the explicit labels rather than from artist order.
    ax.legend()
    plt.show()
    
# Plot 'Official Time' (y axis) against 'Gender', then against 'Age'.
ScatterPlot(bm, 'Official Time', 'Gender')
ScatterPlot(bm, 'Official Time', 'Age')
    
    
    
In [3]:
    
import numpy as np
import matplotlib.pyplot as plt
def FilterPlot(F1, F2, threshold, data=None):
    """Scatter-plot ``F1`` (y axis) against ``F2`` (x axis) for rows with 'Gender' < ``threshold``.

    Rows with 'M/F' == 1 are drawn in blue and labelled 'Male'; rows with
    'M/F' == 0 are drawn in red and labelled 'Female'.

    Parameters
    ----------
    F1 : str
        Name of the column plotted on the y axis.
    F2 : str
        Name of the column plotted on the x axis.
    threshold : number
        Only rows whose 'Gender' value is strictly below this are plotted.
    data : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``bm`` is used, which
        keeps existing three-argument calls working.
    """
    if data is None:
        # Backward-compatible fallback to the global frame loaded earlier in the notebook.
        data = bm

    # Filter the data frame, then split it by the 'M/F' flag.
    sub = data[data['Gender'] < threshold]
    male = sub[sub['M/F'] == 1]
    female = sub[sub['M/F'] == 0]

    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(male[F2], male[F1], alpha=0.3, c='blue', label='Male')
    ax.scatter(female[F2], female[F1], alpha=0.3, c='red', label='Female')
    # Label the axes so the figure can be read on its own.
    ax.set_xlabel(F2)
    ax.set_ylabel(F1)
    ax.legend()
    plt.show()
    
# Restrict to rows with 'Gender' < 5000, then plot 'Official Time' vs 'Age'
# and 'M/F' vs 'Official Time'.
FilterPlot('Official Time', 'Age', 5000)
FilterPlot('M/F', 'Official Time', 5000)
    
    
    
In [4]:
    
import seaborn as sns
def PlotStrip(bm, threshold=1000):
    """Draw a jittered strip plot of 'Official Time' for each 'M/F' value.

    Parameters
    ----------
    bm : pandas.DataFrame
        Frame containing 'Gender', 'M/F' and 'Official Time' columns.
    threshold : number, optional
        Only rows whose 'Gender' value is strictly below this are plotted
        (default 1000).
    """
    # Filter the data frame before plotting.
    sub = bm[bm.Gender < threshold]
    sns.stripplot(y='Official Time', x='M/F', data=sub, jitter=True)
    # Current seaborn does not expose pyplot as `sns.plt`; use the `plt`
    # module imported from matplotlib earlier in the notebook.
    plt.show()
# Strip plot with the default threshold (rows with 'Gender' < 1000).
PlotStrip(bm)
    
    
In [5]:
    
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
def GenerateTrainTestSet(bm, Fs, F2, threshold=200000):
    """Split the rows with 'Gender' < ``threshold`` into train and test sets.

    Parameters
    ----------
    bm : pandas.DataFrame
        Source frame; must contain a 'Gender' column plus the requested columns.
    Fs : list of str
        Names of the feature columns.
    F2 : str
        Name of the target column.
    threshold : number, optional
        Only rows whose 'Gender' value is strictly below this are used.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as produced by ``train_test_split``.
    """
    selected = bm[bm['Gender'] < threshold]
    features = selected[Fs]
    target = selected[F2]
    # random_state is fixed so the split is the same on every run.
    return tuple(train_test_split(features, target, random_state=0))
# Single feature 'Official Time', target 'M/F'; these sets are reused by the model cells below.
x_train, x_test, y_train, y_test = GenerateTrainTestSet(bm, ['Official Time'], 'M/F')
    
In [6]:
    
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
def PrintEvaluation(y_test, y_pred):
    """Print regression and classification metrics for a set of predictions.

    Prints, in order: mean absolute error, mean squared error, R2 score,
    accuracy, the classification report and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.

    Notes
    -----
    This is a best-effort report: if any metric fails, an error message and
    the underlying exception are printed instead of raising, so the notebook
    keeps running.
    """
    try:
        print('MAE:', mean_absolute_error(y_test, y_pred))
        print('MSE:', mean_squared_error(y_test, y_pred))
        print('R2:', r2_score(y_test, y_pred))
        print('ACCURACY:', accuracy_score(y_test, y_pred))
        print('REPORT:',classification_report(y_test, y_pred))
        print('CM:', confusion_matrix(y_test, y_pred))
    except Exception as error:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and show the real cause to make debugging possible.
        print('Errore nel calcolo delle statistiche: Debug il tuo codice')
        print('Dettaglio errore:', repr(error))
    
In [7]:
    
def PlotPredictions(x_test, y_test, y_pred):
    """Draw two scatter figures over ``x_test``: true values, then predictions.

    Parameters
    ----------
    x_test : array-like
        Values used on the x axis of both figures.
    y_test : array-like
        True target values, drawn in blue on the first figure.
    y_pred : array-like
        Predicted target values, drawn in red on the second figure.
    """
    # One figure per series: test values first, predicted values second.
    for values, color in ((y_test, 'blue'), (y_pred, 'red')):
        fig, ax = plt.subplots(figsize=(13, 7))
        ax.scatter(x_test, values, alpha=0.3, c=color)
    plt.show()
    
In [8]:
    
from sklearn.linear_model import LinearRegression    
def RunLinearRegression(x_train, x_test, y_train):
    """Fit a linear regression and turn its predictions into 0/1 labels.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    x_test : pandas.DataFrame
        Features to predict on.
    y_train : array-like
        Training targets.

    Returns
    -------
    list of int
        1 where the predicted value is greater than 0.5, otherwise 0.
    """
    # The `normalize` argument was removed from LinearRegression in
    # scikit-learn 1.2; False was its default, so omitting it fits the same
    # model on both old and new versions.
    lr = LinearRegression()
    # Input to fit/predict must be DataFrames (2-D), not Series.
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    # Threshold the continuous output at 0.5 to obtain class labels.
    return [1 if p > 0.5 else 0 for p in y_pred]
# Predict with the thresholded linear regression, then plot and score the predictions.
y_pred = RunLinearRegression(x_train, x_test, y_train)
PlotPredictions(x_test, y_test, y_pred)
PrintEvaluation(y_test, y_pred)
    
    
    
    
In [9]:
    
import seaborn as sns
# Joint plot of 'M/F' against 'Official Time' with a linear regression fit.
sns.jointplot(data=bm, x='Official Time', y='M/F', kind='reg', color='g')
# Current seaborn does not expose pyplot as `sns.plt`; use the `plt` module
# imported from matplotlib earlier in the notebook.
plt.show()
    
    
    
In [10]:
    
from sklearn.linear_model import LogisticRegression
def RunLogisticRegression(x_train, x_test, y_train):
    """Fit an L2-penalised, class-balanced logistic regression and predict 0/1 labels.

    The first three rows of predicted probabilities are printed for inspection.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    x_test : pandas.DataFrame
        Features to predict on.
    y_train : array-like
        Training targets.

    Returns
    -------
    list of int
        1 where the second probability column exceeds the first, otherwise 0.
    """
    classifier = LogisticRegression(penalty='l2', class_weight='balanced')
    # Input to fit/predict_proba must be DataFrames (2-D), not Series.
    classifier.fit(x_train, y_train)
    probabilities = classifier.predict_proba(x_test)
    print(probabilities[:3])

    labels = []
    for row in probabilities:
        # Pick label 1 only when its probability is strictly the larger one.
        labels.append(1 if row[1] > row[0] else 0)
    return labels
    
# Predict with the logistic regression and score the predictions.
y_pred = RunLogisticRegression(x_train, x_test, y_train)
PrintEvaluation(y_test, y_pred)
    
    
In [11]:
    
# Joint plot of 'M/F' against 'Official Time' with a logistic regression fit.
sns.jointplot(data=bm, x='Official Time', y='M/F', kind='reg', color='g', logistic=True)
# Current seaborn does not expose pyplot as `sns.plt`; use the `plt` module
# imported from matplotlib earlier in the notebook.
plt.show()
    
    
    
In [12]:
    
from sklearn import neighbors
def RunNeighborClassifier(x_train, x_test, y_train):
    """Fit a 5-nearest-neighbours classifier and predict labels for ``x_test``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    x_test : pandas.DataFrame
        Features to predict on.
    y_train : array-like
        Training targets.

    Returns
    -------
    The labels returned by ``KNeighborsClassifier.predict`` for ``x_test``.
    """
    model = neighbors.KNeighborsClassifier(n_neighbors=5)
    # Input to fit/predict must be DataFrames (2-D), not Series.
    model.fit(x_train, y_train)
    return model.predict(x_test)
# Predict with the k-nearest-neighbours classifier and score the predictions.
y_pred = RunNeighborClassifier(x_train, x_test, y_train)
PrintEvaluation(y_test, y_pred)